GetWords(Predicate<String>,Predicate<String>,Predicate<String>) Метод (TextRegion)
Возвращает слова этой текстовой области.
Parameters
- wordCharacterPredicate
- Предикат символа слова, определяющий допустимые символы в словах.
- wordDelimiterPredicate
- Предикат разделителя слов, определяющий разделители слов.
- whiteSpaceCharacterPredicate
- Предикат символа пробела, определяющий символы пробела.
Return Value
Массив объектов TextRegion, каждый из которых определяет одно слово.
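Ниже приведён код на VB.NET и C#, который демонстрирует, как извлечь из PDF-страницы только слова, состоящие из букв и цифр (в этом примере символом слова также считаются знак подчёркивания "_" и символы UTF-32).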
''' <summary>
''' Word-character predicate: decides whether a symbol may be part of a word.
''' </summary>
''' <param name="symbol">The symbol to classify.</param>
''' <returns>
''' <c>True</c> if <c>UnicodeCharacterCollection.IsUtf32Symbol</c> reports <c>True</c> for the symbol
''' at index 0, if the first character of the symbol is a letter or a digit,
''' or if the symbol equals "_"; otherwise, <c>False</c>.
''' </returns>
Private Shared Function IsWordCharacter(symbol As String) As Boolean
    Dim isUtf32Symbol As Boolean = Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0)
    ' UTF-32 symbols are always accepted; otherwise only letters, digits and the underscore are
    Return isUtf32Symbol OrElse Char.IsLetterOrDigit(symbol(0)) OrElse symbol = "_"
End Function
''' <summary>
''' Word-delimiter predicate: decides whether a symbol is a punctuation mark.
''' </summary>
''' <param name="symbol">The symbol to classify.</param>
''' <returns>
''' <c>False</c> if <c>UnicodeCharacterCollection.IsUtf32Symbol</c> reports <c>True</c> for the symbol
''' at index 0; otherwise, the result of <c>Char.IsPunctuation</c> for the first character of the symbol.
''' </returns>
Private Shared Function IsPunctuation(symbol As String) As Boolean
    Dim isUtf32Symbol As Boolean = Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0)
    ' UTF-32 symbols are never treated as punctuation
    Return (Not isUtf32Symbol) AndAlso Char.IsPunctuation(symbol(0))
End Function
''' <summary>
''' White-space predicate: decides whether a symbol is a white-space character.
''' </summary>
''' <param name="symbol">The symbol to classify.</param>
''' <returns>
''' <c>False</c> if <c>UnicodeCharacterCollection.IsUtf32Symbol</c> reports <c>True</c> for the symbol
''' at index 0; otherwise, the result of <c>Char.IsWhiteSpace</c> for the first character of the symbol.
''' </returns>
Private Shared Function IsWhiteSpace(symbol As String) As Boolean
    Dim isUtf32Symbol As Boolean = Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0)
    ' UTF-32 symbols are never treated as white space
    Return (Not isUtf32Symbol) AndAlso Char.IsWhiteSpace(symbol(0))
End Function
''' <summary>
''' Extracts the words of a PDF page and returns them as text, one word per line.
''' </summary>
''' <remarks>
''' The page text is split into words by <c>TextRegion.GetWords</c> using the
''' <c>IsWordCharacter</c>, <c>IsPunctuation</c> and <c>IsWhiteSpace</c> predicates.
''' </remarks>
''' <param name="page">The PDF page to extract words from.</param>
''' <returns>
''' A string containing the text content of each word region,
''' each followed by a line terminator.
''' </returns>
Public Shared Function GetLetterAndDigitWordsFromPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage) As String
    ' split the text region of the page into word regions
    Dim wordRegions As Vintasoft.Imaging.Text.TextRegion() = page.TextRegion.GetWords(AddressOf IsWordCharacter, AddressOf IsPunctuation, AddressOf IsWhiteSpace)

    ' write the text of every word region on its own line
    Dim output As New System.Text.StringBuilder()
    For index As Integer = 0 To wordRegions.Length - 1
        output.AppendLine(wordRegions(index).TextContent)
    Next

    Return output.ToString()
End Function
/// <summary>
/// Word-character predicate: decides whether a symbol may be part of a word.
/// </summary>
/// <param name="symbol">The symbol to classify.</param>
/// <returns>
/// <c>true</c> if <c>UnicodeCharacterCollection.IsUtf32Symbol</c> reports <c>true</c> for the symbol
/// at index 0, if the first character of the symbol is a letter or a digit,
/// or if the symbol equals "_"; otherwise, <c>false</c>.
/// </returns>
private static bool IsWordCharacter(string symbol)
{
    bool isUtf32Symbol = Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0);
    // UTF-32 symbols are always accepted; otherwise only letters, digits and the underscore are
    return isUtf32Symbol || char.IsLetterOrDigit(symbol[0]) || symbol == "_";
}
/// <summary>
/// Word-delimiter predicate: decides whether a symbol is a punctuation mark.
/// </summary>
/// <param name="symbol">The symbol to classify.</param>
/// <returns>
/// <c>false</c> if <c>UnicodeCharacterCollection.IsUtf32Symbol</c> reports <c>true</c> for the symbol
/// at index 0; otherwise, the result of <c>char.IsPunctuation</c> for the first character of the symbol.
/// </returns>
private static bool IsPunctuation(string symbol)
{
    bool isUtf32Symbol = Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0);
    // UTF-32 symbols are never treated as punctuation
    return !isUtf32Symbol && char.IsPunctuation(symbol[0]);
}
/// <summary>
/// White-space predicate: decides whether a symbol is a white-space character.
/// </summary>
/// <param name="symbol">The symbol to classify.</param>
/// <returns>
/// <c>false</c> if <c>UnicodeCharacterCollection.IsUtf32Symbol</c> reports <c>true</c> for the symbol
/// at index 0; otherwise, the result of <c>char.IsWhiteSpace</c> for the first character of the symbol.
/// </returns>
private static bool IsWhiteSpace(string symbol)
{
    bool isUtf32Symbol = Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0);
    // UTF-32 symbols are never treated as white space
    return !isUtf32Symbol && char.IsWhiteSpace(symbol[0]);
}
/// <summary>
/// Extracts the words of a PDF page and returns them as text, one word per line.
/// </summary>
/// <remarks>
/// The page text is split into words by <c>TextRegion.GetWords</c> using the
/// <c>IsWordCharacter</c>, <c>IsPunctuation</c> and <c>IsWhiteSpace</c> predicates.
/// </remarks>
/// <param name="page">The PDF page to extract words from.</param>
/// <returns>
/// A string containing the text content of each word region,
/// each followed by a line terminator.
/// </returns>
public static string GetLetterAndDigitWordsFromPdfPage(Vintasoft.Imaging.Pdf.Tree.PdfPage page)
{
    // split the text region of the page into word regions
    Vintasoft.Imaging.Text.TextRegion[] wordRegions =
        page.TextRegion.GetWords(IsWordCharacter, IsPunctuation, IsWhiteSpace);

    // write the text of every word region on its own line
    System.Text.StringBuilder output = new System.Text.StringBuilder();
    for (int index = 0; index < wordRegions.Length; index++)
    {
        output.AppendLine(wordRegions[index].TextContent);
    }

    return output.ToString();
}
Целевые платформы: .NET 8; .NET 7; .NET 6; .NET Framework 4.8, 4.7, 4.6, 4.5, 4.0, 3.5